

<!DOCTYPE html>
<html lang="zh-CN" data-default-color-scheme=&#34;auto&#34;>



<head>
  <meta charset="UTF-8">
  <link rel="apple-touch-icon" sizes="76x76" href="/img/favicon.png">
  <link rel="icon" href="/img/favicon.png">
  <meta name="viewport"
        content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, shrink-to-fit=no">
  <meta http-equiv="x-ua-compatible" content="ie=edge">
  
  <meta name="theme-color" content="#2f4154">
  <meta name="description" content="沧海横流，尽显英雄本色；激浊扬清，正是猛士当时">
  <meta name="author" content="closer">
  <meta name="keywords" content="">
  
  <title>使用正则表达式 - closer的自留地</title>

  <link  rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/css/bootstrap.min.css" />


  <link  rel="stylesheet" href="https://cdn.jsdelivr.net/npm/github-markdown-css@4.0.0/github-markdown.min.css" />
  <link  rel="stylesheet" href="/lib/hint/hint.min.css" />

  
    
    
      
      <link  rel="stylesheet" href="https://cdn.jsdelivr.net/npm/highlight.js@10.6.0/styles/androidstudio.min.css" />
    
  

  
    <link  rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.css" />
  



<!-- 主题依赖的图标库，不要自行修改 -->

<link rel="stylesheet" href="//at.alicdn.com/t/font_1749284_ba1fz6golrf.css">



<link rel="stylesheet" href="//at.alicdn.com/t/font_1736178_kmeydafke9r.css">


<link  rel="stylesheet" href="/css/main.css" />

<!-- 自定义样式保持在最底部 -->


  <script id="fluid-configs">
    var Fluid = window.Fluid || {};
    var CONFIG = {"hostname":"blog.zsaa.top","root":"/","version":"1.8.10","typing":{"enable":true,"typeSpeed":70,"cursorChar":"_","loop":false},"anchorjs":{"enable":true,"element":"h1,h2,h3,h4,h5,h6","placement":"right","visible":"always","icon":""},"progressbar":{"enable":true,"height_px":3,"color":"#29d","options":{"showSpinner":false,"trickleSpeed":100}},"copy_btn":true,"image_zoom":{"enable":true,"img_url_replace":["",""]},"toc":{"enable":true,"headingSelector":"h1,h2,h3,h4,h5,h6","collapseDepth":0},"lazyload":{"enable":true,"loading_img":"/img/loading.gif","onlypost":false,"offset_factor":2},"web_analytics":{"enable":true,"baidu":"608f2baddd361128381ad2bf9377bf89","google":null,"gtag":null,"tencent":{"sid":null,"cid":null},"woyaola":null,"cnzz":null,"leancloud":{"app_id":"YzLqNtMw1YEwwACli1FUsIUM-gzGzoHsz","app_key":"HLUt5izfTvTcbEbOrA59W92a","server_url":"https://yzlqntmw.lc-cn-n1-shared.com"}}};
  </script>
  <script  src="/js/utils.js" ></script>
  <script  src="/js/color-schema.js" ></script>
<meta name="generator" content="Hexo 5.4.0"></head>


<body>
  <header style="height: 70vh;">
    <nav id="navbar" class="navbar fixed-top  navbar-expand-lg navbar-dark scrolling-navbar">
  <div class="container">
    <a class="navbar-brand"
       href="/">&nbsp;<strong>Hello</strong>&nbsp;</a>

    <button id="navbar-toggler-btn" class="navbar-toggler" type="button" data-toggle="collapse"
            data-target="#navbarSupportedContent"
            aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
      <div class="animated-icon"><span></span><span></span><span></span></div>
    </button>

    <!-- Collapsible content -->
    <div class="collapse navbar-collapse" id="navbarSupportedContent">
      <ul class="navbar-nav ml-auto text-center">
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/">
                <i class="iconfont icon-home-fill"></i>
                首页
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/archives/">
                <i class="iconfont icon-archive-fill"></i>
                归档
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/categories/">
                <i class="iconfont icon-category-fill"></i>
                分类
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/tags/">
                <i class="iconfont icon-tags-fill"></i>
                标签
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/about/">
                <i class="iconfont icon-user-fill"></i>
                关于
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/links/">
                <i class="iconfont icon-link-fill"></i>
                友链
              </a>
            </li>
          
        
        
          <li class="nav-item" id="search-btn">
            <a class="nav-link" target="_self" data-toggle="modal" data-target="#modalSearch">&nbsp;<i
                class="iconfont icon-search"></i>&nbsp;</a>
          </li>
        
        
          <li class="nav-item" id="color-toggle-btn">
            <a class="nav-link" target="_self">&nbsp;<i
                class="iconfont icon-dark" id="color-toggle-icon"></i>&nbsp;</a>
          </li>
        
      </ul>
    </div>
  </div>
</nav>

    <div class="banner" id="banner" parallax=true
         style="background: url('/img/default.jpg') no-repeat center center;
           background-size: cover;">
      <div class="full-bg-img">
        <div class="mask flex-center" style="background-color: rgba(0, 0, 0, 0.3)">
          <div class="page-header text-center fade-in-up">
            <span class="h2" id="subtitle" title="使用正则表达式">
              
            </span>

            
              <div class="mt-3">
  
  
    <span class="post-meta">
      <i class="iconfont icon-date-fill" aria-hidden="true"></i>
      <time datetime="2020-02-23 14:55" pubdate>
        2020年2月23日 下午
      </time>
    </span>
  
</div>

<div class="mt-1">
  
    
    <span class="post-meta mr-2">
      <i class="iconfont icon-chart"></i>
      3.1k 字
    </span>
  

  
    
    <span class="post-meta mr-2">
      <i class="iconfont icon-clock-fill"></i>
      
      
      36
       分钟
    </span>
  

  
  
    
      <!-- LeanCloud 统计文章PV -->
      <span id="leancloud-page-views-container" class="post-meta" style="display: none">
        <i class="iconfont icon-eye" aria-hidden="true"></i>
        <span id="leancloud-page-views"></span> 次
      </span>
    
  
</div>

            
          </div>

          
        </div>
      </div>
    </div>
  </header>

  <main>
    
      

<div class="container-fluid nopadding-x">
  <div class="row nomargin-x">
    <div class="d-none d-lg-block col-lg-2"></div>
    <div class="col-lg-8 nopadding-x-md">
      <div class="container nopadding-x-md" id="board-ctn">
        <div class="py-5" id="board">
          <article class="post-content mx-auto">
            <!-- SEO header -->
            <h1 style="display: none">使用正则表达式</h1>
            
              <p class="note note-info">
                
                  本文最后更新于：2020年4月24日 晚上
                
              </p>
            
            <div class="markdown-body">
              <blockquote>
<p><strong>本文转载于GitHub项目<a target="_blank" rel="noopener" href="https://github.com/jackfrued/Python-100-Days">Python - 100天从新手到大师</a></strong></p>
</blockquote>
<h3 id="一：-正则表达式相关知识"><a href="#一：-正则表达式相关知识" class="headerlink" title="一： 正则表达式相关知识"></a>一： 正则表达式相关知识</h3><p>在编写处理字符串的程序或网页时，经常会有查找符合某些复杂规则的字符串的需要，正则表达式就是用于描述这些规则的工具，换句话说正则表达式是一种工具，它定义了字符串的匹配模式（如何检查一个字符串是否有跟某种模式匹配的部分或者从一个字符串中将与模式匹配的部分提取出来或者替换掉）。如果你在Windows操作系统中使用过文件查找并且在指定文件名时使用过通配符（*和?），那么正则表达式也是与之类似的用来进行文本匹配的工具，只不过比起通配符正则表达式更强大，它能更精确地描述你的需求（当然你付出的代价是书写一个正则表达式比打出一个通配符要复杂得多，要知道任何给你带来好处的东西都是有代价的，就如同学习一门编程语言一样），比如你可以编写一个正则表达式，用来查找所有以0开头，后面跟着2-3个数字，然后是一个连字号“-”，最后是7或8位数字的字符串（像028-12345678或0813-7654321），这不就是国内的座机号码吗。最初计算机是为了做数学运算而诞生的，处理的信息基本上都是数值，而今天我们在日常工作中处理的信息基本上都是文本数据，我们希望计算机能够识别和处理符合某些模式的文本，正则表达式就显得非常重要了。今天几乎所有的编程语言都提供了对正则表达式操作的支持，Python通过标准库中的re模块来支持正则表达式操作。</p>
<p>我们可以考虑下面一个问题：我们从某个地方（可能是一个文本文件，也可能是网络上的一则新闻）获得了一个字符串，希望在字符串中找出手机号和座机号。当然我们可以设定手机号是11位的数字（注意并不是随机的11位数字，因为你没有见过“25012345678”这样的手机号吧）而座机号跟上一段中描述的模式相同，如果不使用正则表达式要完成这个任务就会很麻烦。</p>
<p>关于正则表达式的相关知识，大家可以阅读一篇非常有名的博客叫<a target="_blank" rel="noopener" href="https://deerchao.net/tutorials/regex/regex.htm">《正则表达式30分钟入门教程》</a>，读完这篇文章后你就可以看懂下面的表格，这是我们对正则表达式中的一些基本符号进行的扼要总结。</p>
<table>
<thead>
<tr>
<th>符号</th>
<th>解释</th>
<th>示例</th>
<th>说明</th>
</tr>
</thead>
<tbody><tr>
<td>.</td>
<td>匹配任意字符</td>
<td>b.t</td>
<td>可以匹配bat / but / b#t / b1t等</td>
</tr>
<tr>
<td>\w</td>
<td>匹配字母/数字/下划线</td>
<td>b\wt</td>
<td>可以匹配bat / b1t / b_t等<br>但不能匹配b#t</td>
</tr>
<tr>
<td>\s</td>
<td>匹配空白字符（包括\r、\n、\t等）</td>
<td>love\syou</td>
<td>可以匹配love you</td>
</tr>
<tr>
<td>\d</td>
<td>匹配数字</td>
<td>\d\d</td>
<td>可以匹配01 / 23 / 99等</td>
</tr>
<tr>
<td>\b</td>
<td>匹配单词的边界</td>
<td>\bThe\b</td>
<td></td>
</tr>
<tr>
<td>^</td>
<td>匹配字符串的开始</td>
<td>^The</td>
<td>可以匹配The开头的字符串</td>
</tr>
<tr>
<td>$</td>
<td>匹配字符串的结束</td>
<td>.exe$</td>
<td>可以匹配.exe结尾的字符串</td>
</tr>
<tr>
<td>\W</td>
<td>匹配非字母/数字/下划线</td>
<td>b\Wt</td>
<td>可以匹配b#t / b@t等<br>但不能匹配but / b1t / b_t等</td>
</tr>
<tr>
<td>\S</td>
<td>匹配非空白字符</td>
<td>love\Syou</td>
<td>可以匹配love#you等<br>但不能匹配love you</td>
</tr>
<tr>
<td>\D</td>
<td>匹配非数字</td>
<td>\d\D</td>
<td>可以匹配9a / 3# / 0F等</td>
</tr>
<tr>
<td>\B</td>
<td>匹配非单词边界</td>
<td>\Bio\B</td>
<td></td>
</tr>
<tr>
<td>[]</td>
<td>匹配来自字符集的任意单一字符</td>
<td>[aeiou]</td>
<td>可以匹配任一元音字母字符</td>
</tr>
<tr>
<td>[^]</td>
<td>匹配不在字符集中的任意单一字符</td>
<td>[^aeiou]</td>
<td>可以匹配任一非元音字母字符</td>
</tr>
<tr>
<td>*</td>
<td>匹配0次或多次</td>
<td>\w*</td>
<td></td>
</tr>
<tr>
<td>+</td>
<td>匹配1次或多次</td>
<td>\w+</td>
<td></td>
</tr>
<tr>
<td>?</td>
<td>匹配0次或1次</td>
<td>\w?</td>
<td></td>
</tr>
<tr>
<td>{N}</td>
<td>匹配N次</td>
<td>\w{3}</td>
<td></td>
</tr>
<tr>
<td>{M,}</td>
<td>匹配至少M次</td>
<td>\w{3,}</td>
<td></td>
</tr>
<tr>
<td>{M,N}</td>
<td>匹配至少M次至多N次</td>
<td>\w{3,6}</td>
<td></td>
</tr>
<tr>
<td>|</td>
<td>分支</td>
<td>foo|bar</td>
<td>可以匹配foo或者bar</td>
</tr>
<tr>
<td>(?#)</td>
<td>注释</td>
<td></td>
<td></td>
</tr>
<tr>
<td>(exp)</td>
<td>匹配exp并捕获到自动命名的组中</td>
<td></td>
<td></td>
</tr>
<tr>
<td>(?&nbsp;&lt;name&gt;exp)</td>
<td>匹配exp并捕获到名为name的组中</td>
<td></td>
<td></td>
</tr>
<tr>
<td>(?:exp)</td>
<td>匹配exp但是不捕获匹配的文本</td>
<td></td>
<td></td>
</tr>
<tr>
<td>(?=exp)</td>
<td>匹配exp前面的位置</td>
<td>\b\w+(?=ing)</td>
<td>可以匹配I’m dancing中的danc</td>
</tr>
<tr>
<td>(?&lt;=exp)</td>
<td>匹配exp后面的位置</td>
<td>(?&lt;=\bdanc)\w+\b</td>
<td>可以匹配I love dancing and reading中的第一个ing</td>
</tr>
<tr>
<td>(?!exp)</td>
<td>匹配后面不是exp的位置</td>
<td></td>
<td></td>
</tr>
<tr>
<td>(?&lt;!exp)</td>
<td>匹配前面不是exp的位置</td>
<td></td>
<td></td>
</tr>
<tr>
<td>*?</td>
<td>重复任意次，但尽可能少重复</td>
<td>a.*b<br>a.*?b</td>
<td>将正则表达式应用于aabab，前者会匹配整个字符串aabab，后者会匹配aab和ab两个字符串</td>
</tr>
<tr>
<td>+?</td>
<td>重复1次或多次，但尽可能少重复</td>
<td></td>
<td></td>
</tr>
<tr>
<td>??</td>
<td>重复0次或1次，但尽可能少重复</td>
<td></td>
<td></td>
</tr>
<tr>
<td>{M,N}?</td>
<td>重复M到N次，但尽可能少重复</td>
<td></td>
<td></td>
</tr>
<tr>
<td>{M,}?</td>
<td>重复M次以上，但尽可能少重复</td>
<td></td>
<td></td>
</tr>
</tbody></table>
<blockquote>
<p><strong>说明：</strong> 如果需要匹配的字符是正则表达式中的特殊字符，那么可以使用\进行转义处理，例如想匹配小数点可以写成\.就可以了，因为直接写.会匹配任意字符；同理，想匹配圆括号必须写成\(和\)，否则圆括号被视为正则表达式中的分组。</p>
</blockquote>
<h3 id="二：-Python对正则表达式的支持"><a href="#二：-Python对正则表达式的支持" class="headerlink" title="二： Python对正则表达式的支持"></a>二： Python对正则表达式的支持</h3><p>Python提供了re模块来支持正则表达式相关操作，下面是re模块中的核心函数。</p>
<table>
<thead>
<tr>
<th>函数</th>
<th>说明</th>
</tr>
</thead>
<tbody><tr>
<td>compile(pattern, flags=0)</td>
<td>编译正则表达式返回正则表达式对象</td>
</tr>
<tr>
<td>match(pattern, string, flags=0)</td>
<td>用正则表达式匹配字符串 成功返回匹配对象 否则返回None</td>
</tr>
<tr>
<td>search(pattern, string, flags=0)</td>
<td>搜索字符串中第一次出现正则表达式的模式 成功返回匹配对象 否则返回None</td>
</tr>
<tr>
<td>split(pattern, string, maxsplit=0, flags=0)</td>
<td>用正则表达式指定的模式分隔符拆分字符串 返回列表</td>
</tr>
<tr>
<td>sub(pattern, repl, string, count=0, flags=0)</td>
<td>用指定的字符串替换原字符串中与正则表达式匹配的模式 可以用count指定替换的次数</td>
</tr>
<tr>
<td>fullmatch(pattern, string, flags=0)</td>
<td>match函数的完全匹配（从字符串开头到结尾）版本</td>
</tr>
<tr>
<td>findall(pattern, string, flags=0)</td>
<td>查找字符串所有与正则表达式匹配的模式 返回字符串的列表</td>
</tr>
<tr>
<td>finditer(pattern, string, flags=0)</td>
<td>查找字符串所有与正则表达式匹配的模式 返回一个迭代器</td>
</tr>
<tr>
<td>purge()</td>
<td>清除隐式编译的正则表达式的缓存</td>
</tr>
<tr>
<td>re.I / re.IGNORECASE</td>
<td>忽略大小写匹配标记</td>
</tr>
<tr>
<td>re.M / re.MULTILINE</td>
<td>多行匹配标记</td>
</tr>
</tbody></table>
<blockquote>
<p><strong>说明：</strong> 上面提到的re模块中的这些函数，实际开发中也可以用正则表达式对象的方法替代对这些函数的使用，如果一个正则表达式需要重复的使用，那么先通过compile函数编译正则表达式并创建出正则表达式对象无疑是更为明智的选择。</p>
</blockquote>
<p>下面我们通过一系列的例子来告诉大家在Python中如何使用正则表达式。</p>
<h4 id="例子1：验证输入用户名和QQ号是否有效并给出对应的提示信息。"><a href="#例子1：验证输入用户名和QQ号是否有效并给出对应的提示信息。" class="headerlink" title="例子1：验证输入用户名和QQ号是否有效并给出对应的提示信息。"></a>例子1：验证输入用户名和QQ号是否有效并给出对应的提示信息。</h4><figure class="highlight python"><table><tr><td class="gutter hljs"><div class="hljs code-wrapper"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br></pre></div></td><td class="code"><div class="hljs code-wrapper"><pre><code class="hljs Python"><span class="hljs-string">&quot;&quot;&quot;</span><br><span class="hljs-string">验证输入用户名和QQ号是否有效并给出对应的提示信息</span><br><span class="hljs-string"></span><br><span class="hljs-string">要求：用户名必须由字母、数字或下划线构成且长度在6~20个字符之间，QQ号是5~12的数字且首位不能为0</span><br><span class="hljs-string">&quot;&quot;&quot;</span><br><span class="hljs-keyword">import</span> re<br><br><br><span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">main</span>():</span><br>    username = <span class="hljs-built_in">input</span>(<span class="hljs-string">&#x27;请输入用户名: &#x27;</span>)<br>    qq = <span class="hljs-built_in">input</span>(<span class="hljs-string">&#x27;请输入QQ号: &#x27;</span>)<br>    <span class="hljs-comment"># match函数的第一个参数是正则表达式字符串或正则表达式对象</span><br>    <span class="hljs-comment"># 第二个参数是要跟正则表达式做匹配的字符串对象</span><br>    m1 = re.match(<span class="hljs-string">r&#x27;^[0-9a-zA-Z_]&#123;6,20&#125;$&#x27;</span>, username)<br>    <span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> m1:<br>        <span class="hljs-built_in">print</span>(<span class="hljs-string">&#x27;请输入有效的用户名.&#x27;</span>)<br>    m2 = re.match(<span class="hljs-string">r&#x27;^[1-9]\d&#123;4,11&#125;$&#x27;</span>, qq)<br>    <span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> m2:<br>        <span class="hljs-built_in">print</span>(<span class="hljs-string">&#x27;请输入有效的QQ号.&#x27;</span>)<br>    <span class="hljs-keyword">if</span> m1 <span class="hljs-keyword">and</span> m2:<br>        <span class="hljs-built_in">print</span>(<span class="hljs-string">&#x27;你输入的信息是有效的!&#x27;</span>)<br><br><br><span class="hljs-keyword">if</span> __name__ == <span class="hljs-string">&#x27;__main__&#x27;</span>:<br>    main()<br></code></pre></div></td></tr></table></figure>

<blockquote>
<p><strong>提示：</strong> 上面在书写正则表达式时使用了“原始字符串”的写法（在字符串前面加上了r），所谓“原始字符串”就是字符串中的每个字符都是它原始的意义，说得更直接一点就是字符串中没有所谓的转义字符啦。因为正则表达式中有很多元字符和需要进行转义的地方，如果不使用原始字符串就需要将反斜杠写作\\，例如表示数字的\d得书写成\\d，这样不仅写起来不方便，阅读的时候也会很吃力。</p>
</blockquote>
<h4 id="例子2：从一段文字中提取出国内手机号码。"><a href="#例子2：从一段文字中提取出国内手机号码。" class="headerlink" title="例子2：从一段文字中提取出国内手机号码。"></a>例子2：从一段文字中提取出国内手机号码。</h4><p>下面这张图是截止到2017年底，国内三家运营商推出的手机号段。</p>
<p><img src="https://closer_laps.coding.net/p/picture/d/picture/git/raw/master/%E8%AF%BB%E4%B9%A6%E7%AC%94%E8%AE%B0/Python/tel-start-number.png" srcset="/img/loading.gif" lazyload alt="tel-start-number.png"></p>
<figure class="highlight python"><table><tr><td class="gutter hljs"><div class="hljs code-wrapper"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br></pre></div></td><td class="code"><div class="hljs code-wrapper"><pre><code class="hljs Python"><span class="hljs-keyword">import</span> re<br><br><br><span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">main</span>():</span><br>    <span class="hljs-comment"># 创建正则表达式对象 使用了前瞻和回顾来保证手机号前后不应该出现数字</span><br>    pattern = re.<span class="hljs-built_in">compile</span>(<span class="hljs-string">r&#x27;(?&lt;=\D)1[34578]\d&#123;9&#125;(?=\D)&#x27;</span>)<br>    sentence = <span class="hljs-string">&#x27;&#x27;&#x27;</span><br><span class="hljs-string">    重要的事情说8130123456789遍，我的手机号是13512346789这个靓号，</span><br><span class="hljs-string">    不是15600998765，也是110或119，王大锤的手机号才是15600998765。</span><br><span class="hljs-string">    &#x27;&#x27;&#x27;</span><br>    <span class="hljs-comment"># 查找所有匹配并保存到一个列表中</span><br>    mylist = re.findall(pattern, sentence)<br>    <span class="hljs-built_in">print</span>(mylist)<br>    <span class="hljs-built_in">print</span>(<span class="hljs-string">&#x27;--------华丽的分隔线--------&#x27;</span>)<br>    <span class="hljs-comment"># 通过迭代器取出匹配对象并获得匹配的内容</span><br>    <span class="hljs-keyword">for</span> temp <span class="hljs-keyword">in</span> pattern.finditer(sentence):<br>        <span class="hljs-built_in">print</span>(temp.group())<br>    <span class="hljs-built_in">print</span>(<span class="hljs-string">&#x27;--------华丽的分隔线--------&#x27;</span>)<br>    <span class="hljs-comment"># 通过search函数指定搜索位置找出所有匹配</span><br>    m = pattern.search(sentence)<br>    <span class="hljs-keyword">while</span> m:<br>        <span class="hljs-built_in">print</span>(m.group())<br>        m = pattern.search(sentence, m.end())<br><br><br><span class="hljs-keyword">if</span> __name__ == <span class="hljs-string">&#x27;__main__&#x27;</span>:<br>    main()<br></code></pre></div></td></tr></table></figure>

<blockquote>
<p><strong>说明：</strong> 上面匹配国内手机号的正则表达式并不够好，因为像14开头的号码只有145或147，而上面的正则表达式并没有考虑这种情况，要匹配国内手机号，更好的正则表达式的写法是：<code>(?&lt;=\D)(1[38]\d&#123;9&#125;|14[57]\d&#123;8&#125;|15[0-35-9]\d&#123;8&#125;|17[678]\d&#123;8&#125;)(?=\D)</code>，国内最近好像有19和16开头的手机号了，但是这个暂时不在我们考虑之列。</p>
</blockquote>
<h4 id="例子3：替换字符串中的不良内容"><a href="#例子3：替换字符串中的不良内容" class="headerlink" title="例子3：替换字符串中的不良内容"></a>例子3：替换字符串中的不良内容</h4><figure class="highlight python"><table><tr><td class="gutter hljs"><div class="hljs code-wrapper"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre></div></td><td class="code"><div class="hljs code-wrapper"><pre><code class="hljs Python"><span class="hljs-keyword">import</span> re<br><br><br><span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">main</span>():</span><br>    sentence = <span class="hljs-string">&#x27;你丫是傻叉吗? 我操你大爷的. Fuck you.&#x27;</span><br>    purified = re.sub(<span class="hljs-string">&#x27;[操肏艹]|fuck|shit|傻[比屄逼叉缺吊屌]|煞笔&#x27;</span>,<br>                      <span class="hljs-string">&#x27;*&#x27;</span>, sentence, flags=re.IGNORECASE)<br>    <span class="hljs-built_in">print</span>(purified)  <span class="hljs-comment"># 你丫是*吗? 我*你大爷的. * you.</span><br><br><br><span class="hljs-keyword">if</span> __name__ == <span class="hljs-string">&#x27;__main__&#x27;</span>:<br>    main()<br></code></pre></div></td></tr></table></figure>

<blockquote>
<p><strong>说明：</strong> re模块的正则表达式相关函数中都有一个flags参数，它代表了正则表达式的匹配标记，可以通过该标记来指定匹配时是否忽略大小写、是否进行多行匹配、是否显示调试信息等。如果需要为flags参数指定多个值，可以使用<a target="_blank" rel="noopener" href="http://www.runoob.com/python/python-operators.html#ysf5">按位或运算符</a>进行叠加，如<code>flags=re.I | re.M</code>。</p>
</blockquote>
<h4 id="例子4：拆分长字符串"><a href="#例子4：拆分长字符串" class="headerlink" title="例子4：拆分长字符串"></a>例子4：拆分长字符串</h4><figure class="highlight python"><table><tr><td class="gutter hljs"><div class="hljs code-wrapper"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></div></td><td class="code"><div class="hljs code-wrapper"><pre><code class="hljs Python"><span class="hljs-keyword">import</span> re<br><br><br><span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">main</span>():</span><br>    poem = <span class="hljs-string">&#x27;窗前明月光，疑是地上霜。举头望明月，低头思故乡。&#x27;</span><br>    sentence_list = re.split(<span class="hljs-string">r&#x27;[，。, .]&#x27;</span>, poem)<br>    <span class="hljs-keyword">while</span> <span class="hljs-string">&#x27;&#x27;</span> <span class="hljs-keyword">in</span> sentence_list:<br>        sentence_list.remove(<span class="hljs-string">&#x27;&#x27;</span>)<br>    <span class="hljs-built_in">print</span>(sentence_list)  <span class="hljs-comment"># [&#x27;窗前明月光&#x27;, &#x27;疑是地上霜&#x27;, &#x27;举头望明月&#x27;, &#x27;低头思故乡&#x27;]</span><br><br><br><span class="hljs-keyword">if</span> __name__ == <span class="hljs-string">&#x27;__main__&#x27;</span>:<br>    main()<br></code></pre></div></td></tr></table></figure>

<h3 id="三：-后话"><a href="#三：-后话" class="headerlink" title="三： 后话"></a>三： 后话</h3><p>如果要从事爬虫类应用的开发，那么正则表达式一定是一个非常好的助手，因为它可以帮助我们迅速的从网页代码中发现某种我们指定的模式并提取出我们需要的信息，当然对于初学者来收，要编写一个正确的适当的正则表达式可能并不是一件容易的事情（当然有些常用的正则表达式可以直接在网上找找），所以实际开发爬虫应用的时候，有很多人会选择<a target="_blank" rel="noopener" href="https://www.crummy.com/software/BeautifulSoup/">Beautiful Soup</a>或<a target="_blank" rel="noopener" href="http://lxml.de/">Lxml</a>来进行匹配和信息的提取，前者简单方便但是性能较差，后者既好用性能也好，但是安装稍嫌麻烦。</p>
<h3 id="四：-文献及工具"><a href="#四：-文献及工具" class="headerlink" title="四： 文献及工具"></a>四： 文献及工具</h3><p><a target="_blank" rel="noopener" href="https://regex101.com/">在线正则表达式：https://regex101.com/</a><br><a target="_blank" rel="noopener" href="https://regex101.com/settings">设置里面可以切换语言</a></p>
<p><a target="_blank" rel="noopener" href="https://gist.github.com/JavaCS3/e36e494e78a02049950bfa7c7ebeb929">实用文档：https://gist.github.com/JavaCS3/e36e494e78a02049950bfa7c7ebeb929</a></p>
<p><a target="_blank" rel="noopener" href="https://www.bilibili.com/video/BV1ef4y1U7V4">视频教程：https://www.bilibili.com/video/BV1ef4y1U7V4</a></p>

            </div>
            <hr>
            <div>
              <div class="post-metas mb-3">
                
                  <div class="post-meta mr-3">
                    <i class="iconfont icon-category"></i>
                    
                      <a class="hover-with-bg" href="/categories/python/">python</a>
                    
                      <a class="hover-with-bg" href="/categories/python/%E6%AD%A3%E5%88%99%E8%A1%A8%E8%BE%BE%E5%BC%8F/">正则表达式</a>
                    
                  </div>
                
                
                  <div class="post-meta">
                    <i class="iconfont icon-tags"></i>
                    
                      <a class="hover-with-bg" href="/tags/python/">python</a>
                    
                      <a class="hover-with-bg" href="/tags/%E6%AD%A3%E5%88%99%E8%A1%A8%E8%BE%BE%E5%BC%8F/">正则表达式</a>
                    
                  </div>
                
              </div>
              
                <p class="note note-warning">
                  
                    本博客所有文章除特别声明外，均采用 <a target="_blank" href="https://creativecommons.org/licenses/by-sa/4.0/deed.zh" rel="nofollow noopener noopener">CC BY-SA 4.0 协议</a> ，转载请注明出处！
                  
                </p>
              
              
                <div class="post-prevnext">
                  <article class="post-prev col-6">
                    
                    
                      <a href="/2020/02/23/%E8%AF%BB%E4%B9%A6%E7%AC%94%E8%AE%B0/Linux/%E7%8E%A9%E8%BD%ACLinux%E6%93%8D%E4%BD%9C%E7%B3%BB%E7%BB%9F/">
                        <i class="iconfont icon-arrowleft"></i>
                        <span class="hidden-mobile">玩转Linux操作系统</span>
                        <span class="visible-mobile">上一篇</span>
                      </a>
                    
                  </article>
                  <article class="post-next col-6">
                    
                    
                      <a href="/2020/02/23/%E8%AF%BB%E4%B9%A6%E7%AC%94%E8%AE%B0/Python/python%E5%A4%84%E7%90%86excle%E5%AE%9E%E4%BE%8B%E4%B9%8B%E5%AD%97%E7%AC%A6%E4%B8%B2%E5%87%8F%E6%B3%95/">
                        <span class="hidden-mobile">python处理excle实例之字符串减法</span>
                        <span class="visible-mobile">下一篇</span>
                        <i class="iconfont icon-arrowright"></i>
                      </a>
                    
                  </article>
                </div>
              
            </div>

            
              <!-- Comments -->
              <article class="comments" id="comments" lazyload>
                
                  
                
                
  <div id="valine"></div>
  <script type="text/javascript">
    Fluid.utils.loadComments('#valine', function() {
      Fluid.utils.createScript('https://cdn.jsdelivr.net/npm/valine@1.4.14/dist/Valine.min.js', function () {
        new Valine({
          el: "#valine",
          app_id: "YzLqNtMw1YEwwACli1FUsIUM-gzGzoHsz",
          app_key: "HLUt5izfTvTcbEbOrA59W92a",
          placeholder: "畅所欲言...",
          path: window.location.pathname,
          avatar: "robohash",
          meta: ["nick","mail","link"],
          pageSize: "10",
          lang: "zh-CN",
          highlight: true,
          recordIP: false,
          serverURLs: "",
        });
      });
    });
  </script>
  <noscript>Please enable JavaScript to view the comments</noscript>


              </article>
            
          </article>
        </div>
      </div>
    </div>
    
      <div class="d-none d-lg-block col-lg-2 toc-container" id="toc-ctn">
        <div id="toc">
  <p class="toc-header"><i class="iconfont icon-list"></i>&nbsp;目录</p>
  <div class="toc-body" id="toc-body"></div>
</div>

      </div>
    
  </div>
</div>

<!-- Custom -->

  <div class="col-lg-7 mx-auto nopadding-x-md">
    <div class="container custom post-custom mx-auto">
      <img src="https://closer_laps.coding.net/p/picture/d/picture/git/raw/master/pay/pay.png" srcset="/img/loading.gif" lazyload class="rounded mx-auto d-block mt-3" style="width:355.4px; height:200px;">
    </div>
  </div>


    

    
      <a id="scroll-top-button" href="#" role="button">
        <i class="iconfont icon-arrowup" aria-hidden="true"></i>
      </a>
    

    
      <div class="modal fade" id="modalSearch" tabindex="-1" role="dialog" aria-labelledby="ModalLabel"
     aria-hidden="true">
  <div class="modal-dialog modal-dialog-scrollable modal-lg" role="document">
    <div class="modal-content">
      <div class="modal-header text-center">
        <h4 class="modal-title w-100 font-weight-bold">搜索</h4>
        <button type="button" id="local-search-close" class="close" data-dismiss="modal" aria-label="Close">
          <span aria-hidden="true">&times;</span>
        </button>
      </div>
      <div class="modal-body mx-3">
        <div class="md-form mb-5">
          <input type="text" id="local-search-input" class="form-control validate">
          <label data-error="x" data-success="v"
                 for="local-search-input">关键词</label>
        </div>
        <div class="list-group" id="local-search-result"></div>
      </div>
    </div>
  </div>
</div>
    

    
  </main>

  <footer class="text-center mt-5 py-3">
  <div class="footer-content">
     <a href="https://hexo.io" target="_blank" rel="nofollow noopener"><span>Hexo</span></a> <i class="iconfont icon-love"></i> <a href="https://github.com/fluid-dev/hexo-theme-fluid" target="_blank" rel="nofollow noopener"><span>Fluid</span></a> 
  </div>
  
  <div class="statistics">
    
    

    
      
        <!-- LeanCloud 统计PV -->
        <span id="leancloud-site-pv-container" style="display: none">
            总访问量 
            <span id="leancloud-site-pv"></span>
             次
          </span>
      
      
        <!-- LeanCloud 统计UV -->
        <span id="leancloud-site-uv-container" style="display: none">
            总访客数 
            <span id="leancloud-site-uv"></span>
             人
          </span>
      

    
  </div>


  
  <!-- 备案信息 -->
  <div class="beian">
    <span>
      <a href="http://beian.miit.gov.cn/" target="_blank" rel="nofollow noopener">
        苏ICP备20032307号
      </a>
    </span>
    
      
        <span>
          <a
            href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=32020602001023"
            rel="nofollow noopener"
            class="beian-police"
            target="_blank"
          >
            
              <span style="visibility: hidden; width: 0">|</span>
              <img src="/img/police_beian.png" srcset="/img/loading.gif" lazyload alt="police-icon"/>
            
            <span>苏公网安备 32020602001023号</span>
          </a>
        </span>
      
    
  </div>


  
</footer>


  <!-- SCRIPTS -->
  
  <script  src="https://cdn.jsdelivr.net/npm/nprogress@0.2.0/nprogress.min.js" ></script>
  <link  rel="stylesheet" href="https://cdn.jsdelivr.net/npm/nprogress@0.2.0/nprogress.min.css" />

  <script>
    NProgress.configure({"showSpinner":false,"trickleSpeed":100})
    NProgress.start()
    window.addEventListener('load', function() {
      NProgress.done();
    })
  </script>


<script  src="https://cdn.jsdelivr.net/npm/jquery@3.6.0/dist/jquery.min.js" ></script>
<script  src="https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/js/bootstrap.min.js" ></script>
<script  src="/js/events.js" ></script>
<script  src="/js/plugins.js" ></script>

<!-- Plugins -->


  
    <script  src="/js/img-lazyload.js" ></script>
  



  



  <script  src="https://cdn.jsdelivr.net/npm/tocbot@4.12.2/dist/tocbot.min.js" ></script>



  <script  src="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.js" ></script>



  <script  src="https://cdn.jsdelivr.net/npm/anchor-js@4.3.0/anchor.min.js" ></script>



  <script defer src="https://cdn.jsdelivr.net/npm/clipboard@2.0.8/dist/clipboard.min.js" ></script>




  <script defer src="/js/leancloud.js" ></script>



  <script  src="https://cdn.jsdelivr.net/npm/typed.js@2.0.11/lib/typed.min.js" ></script>
  <script>
    (function (window, document) {
      var typing = Fluid.plugins.typing;
      var title = document.getElementById('subtitle').title;
      
      typing(title)
      
    })(window, document);
  </script>



  <script  src="/js/local-search.js" ></script>
  <script>
    (function () {
      var path = "/local-search.xml";
      $('#local-search-input').on('click', function() {
        searchFunc(path, 'local-search-input', 'local-search-result');
      });
      $('#modalSearch').on('shown.bs.modal', function() {
        $('#local-search-input').focus();
      });
    })()
  </script>





  

  
    <!-- MathJax -->
    <script>
      MathJax = {
        tex: {
          inlineMath: [['$', '$'], ['\\(', '\\)']]
        },
        options: {
          renderActions: {
            findScript: [10, doc => {
              document.querySelectorAll('script[type^="math/tex"]').forEach(node => {
                const display = !!node.type.match(/; *mode=display/);
                const math = new doc.options.MathItem(node.textContent, doc.inputJax[0], display);
                const text = document.createTextNode('');
                node.parentNode.replaceChild(text, node);
                math.start = { node: text, delim: '', n: 0 };
                math.end = { node: text, delim: '', n: 0 };
                doc.math.push(math);
              });
            }, '', false],
            insertedScript: [200, () => {
              document.querySelectorAll('mjx-container').forEach(node => {
                let target = node.parentNode;
                if (target.nodeName.toLowerCase() === 'li') {
                  target.parentNode.classList.add('has-jax');
                }
              });
            }, '', false]
          }
        }
      };
    </script>

    <script async src="https://cdn.jsdelivr.net/npm/mathjax@3.1.2/es5/tex-svg.js" ></script>

  








  
    <!-- Baidu Analytics -->
    <script defer>
      var _hmt = _hmt || [];
      (function () {
        var hm = document.createElement("script");
        hm.src = "https://hm.baidu.com/hm.js?608f2baddd361128381ad2bf9377bf89";
        var s = document.getElementsByTagName("script")[0];
        s.parentNode.insertBefore(hm, s);
      })();
    </script>
  

  

  

  

  

  





<!-- 主题的启动项 保持在最底部 -->
<script  src="/js/boot.js" ></script>


</body>
</html>
